### Variance estimation for low income proportion: SOEP 2012 ###
### Standard Variant ###

### Seed ###

  set.seed(17231)

### Load packages ###

  library(foreign)
  
### General setup ###  
  
  sampling.fraction <- 0.025
  scale <- 0.54 # Scale value
  beta <- 0.5 # Which quintile for poverty line
  alpha <- 0.6 # Which fraction of quintile for poverty line
  
### Read data ###
  
  dat1 <- read.dta("/home/christian/Dokumente/Daten/equivar/wv2012.dta",convert.factors=FALSE)
  dat1 <- dat1[,c("d1110612","d1110712","i1110212","l1110212","h1110112")] # Only keep variables of interest
  names(dat1) <- c("family.size","no.children","income","ow","no.child.14") # Rename variables
  # Generate dummies
  dat1$child14 <- 0
  dat1$child14[dat1$no.child.14>0] <- 1
  dat1$west <- 0
  dat1$ost <- 0
  dat1$west[dat1$ow==1] <- 1
  dat1$ost[dat1$ow==2] <- 1
  #dat1 <- dat1[dat1$income!=0,] # Drop obs. with zero income
  N <- dim(dat1)[1] # Population size
  N.ost <- sum(dat1$ost) # Population size east Germany
  N.west <- sum(dat1$west) # Population size west Germany
  N.ch <- sum(dat1$child14) # Population size hh with children
  n <- round(sampling.fraction*N,digits=0) # Sample size
  
### Calculate true lip ###

  # Equivalized income
  dat1$equiv.weight <- dat1$family.size^scale
  dat1$equiv.inc <- dat1$income/dat1$equiv.weight
  # Poverty line 
  true.poverty.line <- alpha*quantile(dat1$equiv.inc,probs=beta) 
  # Poverty indicator 
  dat1$poor <- 0
  dat1$poor[dat1$equiv.inc<=true.poverty.line] <- 1
  # Low income proportion 
  true.lip <- mean(dat1$poor)
  # Low income proportion West/East
  true.lip.ost <- mean(dat1$poor[dat1$ow==2])
  true.lip.west <- mean(dat1$poor[dat1$ow==1])
  # Low income proportion households with children
  true.lip.ch <- mean(dat1$poor[dat1$child14==1])
  
### Objects for results ###  
  
  LIP <- numeric(sims) # For estimates of lip
  LIP.ost <- numeric(sims) # For estimates of lip east
  LIP.west <- numeric(sims) # For estimates of lip west
  LIP.ch <- numeric(sims) # For estimates of lip hh with children
  
  SVAR <- numeric(sims) # Variance estimates of IF 
  SVAR.ost <- numeric(sims) # Variance estimates of IF 
  SVAR.west <- numeric(sims) # Variance estimates of IF
  SVAR.ch <- numeric(sims) # Variance estimates of IF
  
  SVAR.cov <- numeric(sims) # Coverage of 95%-CI
  SVAR.cov.ost <- numeric(sims) # Coverage of 95%-CI
  SVAR.cov.west <- numeric(sims) # Coverage of 95%-CI
  SVAR.cov.ch <- numeric(sims) # Coverage of 95%-CI
  
### Simulation runs  
  
  for(i in 1:sims) {
    
    # Sample
    dat <- dat1[sample(1:N,size=n,rep=F),]
    
    # Calculate lip
    poverty.line <- alpha*quantile(dat$equiv.inc,probs=beta) 
    dat$poor <- 0
    dat$poor[dat$equiv.inc<=poverty.line] <- 1
    lip <- mean(dat$poor)
    lip.ost <- mean(dat$poor[dat$ow==2])
    lip.west <- mean(dat$poor[dat$ow==1])
    lip.ch <- mean(dat$poor[dat$child14==1])
    
    # Density estimates (bandwidth according to Berger/Skinner after Silverman)
    bandwidth <- 0.79 * (quantile(dat$equiv.inc,probs=0.75)-quantile(dat$equiv.inc,probs=0.25))* n^(-0.2)
    tmp <- density(dat$equiv.inc,bw=bandwidth)
    f1 <- tmp$y[which.min(abs(tmp$x-poverty.line))]
    f2 <- tmp$y[which.min(abs(tmp$x-poverty.line/alpha))]
    
    # Variance via influence function full sample
    dat$below.median <- 0
    dat$below.median[dat$equiv.inc<=median(dat$equiv.inc)] <- 1
    dat$z1 <- 1/N * (dat$poor - lip)
    dat$z2 <- -alpha * 1/N * (f1/f2) * (dat$below.median-beta)
    dat$z <- dat$z1+dat$z2
    # Variance estimate
    s_var <- var(dat$z)*(N*(N-n))/n 
    # Covergae of 95%-CI
    SVAR.cov[i] <- true.lip < (lip+1.96*sqrt(s_var)) & true.lip > (lip-1.96*sqrt(s_var))

    # For east 
    dat$z1 <- 1/N.ost * (dat$poor - lip)*dat$ost
    dat$z <- dat$z1+dat$z2
    s_var.ost <- var(dat$z)*(N*(N-n))/n
    # Coverage of 95%-CI
    SVAR.cov.ost[i] <- true.lip.ost < (lip.ost+1.96*sqrt(s_var.ost)) & true.lip.ost > (lip.ost-1.96*sqrt(s_var.ost))
    
    # For west 
    dat$z1 <- 1/N.west * (dat$poor - lip)*dat$west
    dat$z <- dat$z1+dat$z2
    s_var.west <- var(dat$z)*(N*(N-n))/n
    # Coverage of 95%-CI
    SVAR.cov.west[i] <- true.lip.west < (lip.west+1.96*sqrt(s_var.west)) & true.lip.west > (lip.west-1.96*sqrt(s_var.west))
    
    # For hh with children
    dat$z1 <- 1/N.ch * (dat$poor - lip)*dat$ch
    dat$z <- dat$z1+dat$z2
    s_var.ch <- var(dat$z)*(N*(N-n))/n
    # Coverage of 95%-CI
    SVAR.cov.ch[i] <- true.lip.ch < (lip.ch+1.96*sqrt(s_var.ch)) & true.lip.ch > (lip.ch-1.96*sqrt(s_var.ch))
  
    # Store results  
    LIP[i] <- lip
    LIP.ost[i] <- lip.ost
    LIP.west[i] <- lip.west
    LIP.ch[i] <- lip.ch
    
    SVAR[i] <- s_var
    SVAR.ost[i] <- s_var.ost
    SVAR.west[i] <- s_var.west
    SVAR.ch[i] <- s_var.ch
    
  }

### Assessment ###

  # "True" results
  true.var <- sum((LIP-true.lip)^2)/sims
  true.sd <- sqrt(true.var)
  
  true.var.ost <- sum((LIP.ost-true.lip.ost)^2)/sims
  true.sd.ost <- sqrt(true.var.ost)
  
  true.var.west <- sum((LIP.west-true.lip.west)^2)/sims
  true.sd.west <- sqrt(true.var.west)
  
  true.var.ch <- sum((LIP.ch-true.lip.ch)^2)/sims
  true.sd.ch <- sqrt(true.var.ch)
  
  # Relative bias
  (mean(SVAR)-true.var)/true.var
  (mean(sqrt(SVAR))-true.sd)/true.sd
  
  (mean(SVAR.ost)-true.var.ost)/true.var.ost
  (mean(sqrt(SVAR.ost))-true.sd.ost)/true.sd.ost
  
  (mean(SVAR.west)-true.var.west)/true.var.west
  (mean(sqrt(SVAR.west))-true.sd.west)/true.sd.west
  
  (mean(SVAR.ch)-true.var.ch)/true.var.ch
  (mean(sqrt(SVAR.ch))-true.sd.ch)/true.sd.ch
  
  # Coverage of 95%-CIs
  sum(SVAR.cov)/sims
  sum(SVAR.cov.ost)/sims
  sum(SVAR.cov.west)/sims
  sum(SVAR.cov.ch)/sims
  
### Save results ###
  
  datei <- paste("/home/christian/Dokumente/Equiv Variance/Ergebnisse/sim1_",sims,"_",round(sampling.fraction*100),".rda",sep="")
  save(list=ls(),file=datei)
  